#-------------------------------------------------------------#
# U.S. Security Threats Dataset
# Author: Jeff Allen (jeffreysallen1@gmail.com)
# Last updated: April 18, 2025
#-------------------------------------------------------------#

#-------------------------------------------------------------#
# 1) Importing
#-------------------------------------------------------------#

#--Load Libraries--#
library(tm) 
library(SnowballC)
library(reshape2)
library(countrycode)
library(tidyverse)
library(gridExtra)

#--Import country data--#
## Read the country reference table from the working directory,
## keeping text columns as character vectors rather than factors.
countries.df <- read.csv(
  file = "countries_ws.csv",
  stringsAsFactors = FALSE
)

#-------------------------------------------------------------#
# 2) Formatting
#-------------------------------------------------------------#

#-------------------------------------------------------------#
# 2.1) N-grams
#-------------------------------------------------------------#

## tm can tokenize n-grams, but reducing each name to a single word is more reliable:
## if an n-gram breaks across lines, it may not be picked up

#-------------------------------------------------------------#
# 2.1.1) Country N-grams
#-------------------------------------------------------------#

str(countries.df)

## Remove USA
## A logical mask is used instead of -which(...): when nothing matches,
## -which(...) is an empty index and would silently drop every row.
countries.df <- countries.df[!(countries.df$iso3c %in% "USA"), ]

## Create n-gram variable
## `ngram` holds the row positions of country names that contain a space
## (printed for inspection).
(ngram <- grep(" ", countries.df$country.name))
countries.df$ngram <- 0
countries.df$ngram[ngram] <- 1 # safe even when `ngram` is empty

## Split out n-grams
ngram.df <- countries.df[countries.df$ngram == 1, ]
countries.df <- countries.df[countries.df$ngram == 0, ]
rownames(ngram.df) <- seq_len(nrow(ngram.df))

##-- Rename n-grams --##
## Decision rules
# (1) Take the first word if it is not a general term (e.g., South)
# (2) Combine important countries that have no unique identifier (e.g., South Sudan -> SouthSudan)
# (3) Acronyms are difficult. Will need to change UK, UAE, and CAR later
# Note on Congo: There is only one mention in 2016 of the ROC. Meanwhile, the DRC is referred to as DRC and Congo. Therefore, I will lean on Congo for the DRC.
# Congo strategy: change ROC to Republic of Congo, use Congo and Congolese for DRC
# Then, later, replace full DRC name and DRC with Congo

## Single-word replacements, listed in the row order of ngram.df.
## The trailing comment on each entry is the original multi-word name.
country.renames <- c(
  "Unitedarabemirates",     # United Arab Emirates
  "Burkina",                # Burkina Faso
  "Bosnia",                 # Bosnia and Herzegovina
  "Centralafricanrepublic", # Central African Republic
  "Ivory",                  # Ivory Coast
  "Congo",                  # Democratic Republic of the Congo
  "Republicofcongo",        # Republic of the Congo
  "Verde",                  # Cape Verde
  "Costa",                  # Costa Rica
  "Czech",                  # Czech Republic
  "Dominican",              # Dominican Republic
  "Unitedkingdom",          # United Kingdom
  "Equatorial",             # Equatorial Guinea
  "Southkorea",             # South Korea
  "Sri",                    # Sri Lanka
  "Macedonia",              # North Macedonia
  "Zealand",                # New Zealand
  "Papua",                  # Papua New Guinea
  "Korea",                  # North Korea
  "Saudi",                  # Saudi Arabia
  "Solomon",                # Solomon Islands
  "Sierra",                 # Sierra Leone
  "Salvador",               # El Salvador
  "Southsudan",             # South Sudan
  "Timor",                  # East Timor
  "Trinidad",               # Trinidad and Tobago
  "Southafrica"             # South Africa
)

## The renames are positional, so they are only valid while the input file
## keeps the same multi-word names in the same order. Fail loudly if there
## are fewer rows than replacements; row-by-row assignment past the last row
## would otherwise append phantom rows with missing identifiers.
if (nrow(ngram.df) < length(country.renames)) {
  stop("Expected at least ", length(country.renames),
       " multi-word country names but found ", nrow(ngram.df),
       "; check countries_ws.csv.", call. = FALSE)
}
ngram.df$country.name[seq_along(country.renames)] <- country.renames

## Re-bind and remove N-grams
countries.df <- rbind(countries.df, ngram.df)
rm(ngram.df, ngram, country.renames)

#-------------------------------------------------------------#
# 2.1.2) Nationality N-grams
#-------------------------------------------------------------#

## Create n-gram variable
## `ngram` holds the row positions of nationalities that contain a space
## (printed for inspection).
(ngram <- grep(" ", countries.df$nationality))
countries.df$ngram <- 0
countries.df$ngram[ngram] <- 1 # safe even when `ngram` is empty

## Split out n-grams
ngram.df <- countries.df[countries.df$ngram == 1, ]
countries.df <- countries.df[countries.df$ngram == 0, ]
rownames(ngram.df) <- seq_len(nrow(ngram.df))

##-- Rename n-grams --##
## Decision rules
# (1) Use NA for entries that will already be identified by replaced country name (e.g., Saudi Arabian will be counted with new country name Saudi)
# (2) Combine nationalities whose states have been combined (e.g., South African -> SouthAfrican)

## Single-word replacements, listed in the row order of ngram.df.
## The trailing comment on each entry is the original multi-word nationality.
nationality.renames <- c(
  "Lucian",         # Saint Lucian
  "Tomean",         # Sao Tomean
  "Vincentian",     # Saint Vincentian
  "Centralafrican", # Central African
  "Congolese",      # Congolese or Congo
  NA,               # Congolese or Congo
  "Verdean",        # Cabo Verdean
  NA,               # Costa Rican
  NA,               # Equatorial Guinean
  "Southkorean",    # South Korean
  NA,               # Sri Lankan
  "Zealander",      # New Zealand (using noun form - see CIA)
  NA,               # Papua New Guinean
  "Korean",         # North Korean
  NA,               # Saudi Arabian
  NA,               # Solomon Islander
  NA,               # Sierra Leonean
  "Southsudanese",  # South Sudanese
  "Southafrican"    # South African
)

## The renames are positional, so they are only valid while the input file
## keeps the same multi-word nationalities in the same order. Fail loudly if
## there are fewer rows than replacements; row-by-row assignment past the
## last row would otherwise append phantom rows with missing identifiers.
if (nrow(ngram.df) < length(nationality.renames)) {
  stop("Expected at least ", length(nationality.renames),
       " multi-word nationalities but found ", nrow(ngram.df),
       "; check countries_ws.csv.", call. = FALSE)
}
ngram.df$nationality[seq_along(nationality.renames)] <- nationality.renames

## Re-bind and remove N-grams
countries.df <- rbind(countries.df, ngram.df)
rm(ngram.df, ngram, nationality.renames)

#-------------------------------------------------------------#
# 2.1.3) Capital City N-grams
#-------------------------------------------------------------#

## Create n-gram variable
## `ngram` holds the row positions of capitals that contain a space
## (printed for inspection).
(ngram <- grep(" ", countries.df$capital))
countries.df$ngram <- 0
countries.df$ngram[ngram] <- 1 # safe even when `ngram` is empty

## Split out n-grams
ngram.df <- countries.df[countries.df$ngram == 1, ]
countries.df <- countries.df[countries.df$ngram == 0, ]
rownames(ngram.df) <- seq_len(nrow(ngram.df))

##-- Re-name N-grams--##
## Decision Rules
# Generally, take first word, unless it is a general term
# Should not re-name cities that will be identified by country name (e.g., "Mexico City")

## Replacements, listed in the row order of ngram.df.
## The trailing comment on each entry is the original multi-word capital.
capital.renames <- c(
  NA,              # Andorra la Vella
  "Buenos",        # Buenos Aires
  "Johns",         # Saint John's
  "Bandar",        # Bandar Seri Begawan
  "Addis",         # Addis Ababa
  "Georges",       # St. George's
  NA,              # Guatemala City
  "Delhi",         # New Delhi
  "Phnom",         # Phnom Penh
  NA,              # Kuwait City
  NA,              # Mexico City
  "Louis",         # Port Louis
  "Kuala",         # Kuala Lumpur
  NA,              # Panama City
  "Vila",          # Port Vila
  "Dhabi",         # Abu Dhabi # First word could be confused with name (e.g., "abu bakr al-baghdadi")
  "Domingo",       # Santo Domingo
  NA,              # San Salvador
  "Port of Spain", # Port of Spain # No replacement
  NA,              # Sao Tome
  "San Jose",      # San Jose # No replacement
  "Moresby"        # Port Moresby
)
## NOTE(review): "Port of Spain" and "San Jose" stay multi-word, so a
## single-word dictionary lookup will presumably never count them - confirm
## this is intended.

## The renames are positional, so they are only valid while the input file
## keeps the same multi-word capitals in the same order. Fail loudly if there
## are fewer rows than replacements; row-by-row assignment past the last row
## would otherwise append phantom rows with missing identifiers.
if (nrow(ngram.df) < length(capital.renames)) {
  stop("Expected at least ", length(capital.renames),
       " multi-word capitals but found ", nrow(ngram.df),
       "; check countries_ws.csv.", call. = FALSE)
}
ngram.df$capital[seq_along(capital.renames)] <- capital.renames

## Re-bind and remove N-grams
countries.df <- rbind(countries.df, ngram.df)
rm(ngram.df, ngram, capital.renames)

## Re-order alphabetically
countries.df <- countries.df[order(countries.df$country.name), ]
rownames(countries.df) <- seq_len(nrow(countries.df))

#-------------------------------------------------------------#
# 2.2) Duplicates
#-------------------------------------------------------------#

## Flag rows where a single term would be listed in two dictionaries.
## Such a term is counted once per dictionary and the counts are later
## summed per country, so every mention would be double counted.
countries.df$dups <- countries.df$country.name == countries.df$nationality
countries.df$dups2 <- countries.df$country.name == countries.df$capital

## which() drops the NA comparisons produced by missing nationality/capital
dups <- which(countries.df$dups)
dups2 <- which(countries.df$dups2)

## Inspect duplicates
countries.df[dups, 1:4]
countries.df[dups2, 1:4]

## Czech, Dominican, Liechtenstein: replace with NA
countries.df[countries.df$iso3c == "CZE",]$nationality <- NA
countries.df[countries.df$iso3c == "DOM",]$nationality <- NA
countries.df[countries.df$iso3c == "LIE",]$nationality <- NA

## Luxembourg and Singapore: use "noun" versions. See CIA statistics.
countries.df[countries.df$iso3c == "LUX",]$nationality <- "Luxembourger"
countries.df[countries.df$iso3c == "SGP",]$nationality <- "Singaporean"

## Capitals identical to the country name: replace with NA.
## The country dictionary already counts these mentions, mirroring the rule
## used for capitals such as "Mexico City".
countries.df$capital[dups2] <- NA

## NOTE(review): only same-row duplicates are handled here. A term shared by
## two different countries (e.g., one country's nationality equal to another
## country's shortened name) would still be counted for both - verify.

## Remove duplicates and ngram columns
## Dropped by name so the result does not depend on column positions.
rm(dups, dups2)
countries.df[c("ngram", "dups", "dups2")] <- NULL

#----------------------------------------------------------------#
# 3) Import TA corpus
#----------------------------------------------------------------#

## Corpus
## Read every file in the "ta" directory whose name matches "ta" into a
## volatile corpus, then print its summary.
corpus.raw <- DirSource(directory = "ta", pattern = "ta") %>%
  VCorpus()
corpus.raw

## Note 1: See the ReadMe file for an important note about the Threat Assessment documents. They should download as a separate folder named "ta" and must be left that way for the code to run properly.
## Note 2: Because no TA was released in 2020, the 2019 TA is used for 2020.

#-----------------------------------------------------------------#
# 4) Pre-processing
#-----------------------------------------------------------------#

#-----------------------------------------------------------------#
# 4.1) Country-specific pre-processing
#-----------------------------------------------------------------#

## Collapse multi-word country names and acronyms into the single-word
## tokens used in the country table, so that word-level counting can find
## them. Matching is case-sensitive and happens before lower-casing.

## One shared transformer: replace every match of `pattern` in the document
## content with `replacement`.
replaceTerm <- content_transformer(
  function(x, pattern, replacement) gsub(pattern, replacement, x)
)

## Rules are applied top to bottom (name = regex pattern, value = replacement).
## Order matters: longer names must be rewritten before shorter names that
## they contain.
## - Acronyms are wrapped in \\b word boundaries so they cannot match inside
##   longer upper-case words (e.g., "UK" in "UKRAINE", "CAR" in "CARIBBEAN").
## - "South Korea" -> "Southkorea" also turns "South Korean" into
##   "Southkorean" (likewise for Sudan/Sudanese and Africa/African), so no
##   separate nationality rule is needed for those three.
## - The DRC rule accepts the name with or without "the" and runs before the
##   ROC rule, so "Democratic Republic of Congo" is never read as the ROC.
term.rules <- c(
  "United Arab Emirates"                = "Unitedarabemirates",
  "\\bUAE\\b"                           = "Unitedarabemirates",
  "United Kingdom"                      = "Unitedkingdom",
  "\\bUK\\b"                            = "Unitedkingdom",
  "Central African Republic"            = "Centralafricanrepublic",
  "\\bCAR\\b"                           = "Centralafricanrepublic",
  "Central African"                     = "Centralafrican",
  "South Korea"                         = "Southkorea",
  "South Sudan"                         = "Southsudan",
  "South Africa"                        = "Southafrica",
  "Democratic Republic of (the )?Congo" = "Congo",
  "\\bDRC\\b"                           = "Congo",
  "Republic of (the )?Congo"            = "Republicofcongo"
)

corpus.prep <- corpus.raw
for (term.pattern in names(term.rules)) {
  corpus.prep <- tm_map(corpus.prep, replaceTerm,
                        pattern = term.pattern,
                        replacement = term.rules[[term.pattern]])
}

## Remove the temporary helpers
rm(replaceTerm, term.rules, term.pattern)

#-----------------------------------------------------------------#
# 4.2) General pre-processing
#-----------------------------------------------------------------#

## Define tospace function
## Transformer that swaps every match of `pattern` for a single space
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

## Cleaning steps, applied in order:
##   1. replace every non-alphanumeric character with a space
##   2. convert to lower case
##   3. strip white space
##   4. remove numbers
corpus <- corpus.prep %>%
  tm_map(toSpace, "[^[:alnum:]]") %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(stripWhitespace) %>%
  tm_map(removeNumbers)

## Typical steps not used here
# Stop words
# Stemming

## Inspect
content(corpus[[14]])

## The intermediate corpora are no longer needed
rm(corpus.prep, corpus.raw)

#-------------------------------------------------------------------#
# 5) Set up dictionaries
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# 5.1) Country dictionary
#-------------------------------------------------------------------#

## Lower-case the country terms, since the corpus was lower-cased
countries.df[["country.name"]] <- tolower(countries.df[["country.name"]])

## Document-term matrix restricted to the country terms
dtm <- DocumentTermMatrix(
  corpus,
  control = list(dictionary = countries.df[["country.name"]])
)

## Peek at the first five documents and seven terms
inspect(dtm[1:5, 1:7])

## Keep a dense matrix copy and drop the sparse object
dtm.mat <- as.matrix(dtm)
rm(dtm)

#-------------------------------------------------------------------#
# 5.2) Nationality dictionary
#-------------------------------------------------------------------#

## Lower-case the nationality terms, since the corpus was lower-cased
countries.df$nationality <- tolower(countries.df$nationality)

## Omit NA
## Filter with a logical mask rather than a negative index: if there were no
## missing nationalities, `countries.df[-nat.na, ]` would use an empty index
## and return zero rows, leaving the dictionary empty.
## `nat.na` is still created because the later clean-up step removes it.
nat.na <- which(is.na(countries.df$nationality))
nationality <- countries.df$nationality[!is.na(countries.df$nationality)]

## Document-term matrix restricted to the nationality terms
dtm.nat <-
  DocumentTermMatrix(corpus, list(dictionary = nationality))

inspect(dtm.nat[1:5, 1:7])

## Keep a dense matrix copy and drop the sparse object
dtm.nat.mat <- as.matrix(dtm.nat)

rm(dtm.nat)

#-------------------------------------------------------------------#
# 5.3) Capitals dictionary
#-------------------------------------------------------------------#

## Lower-case the capital terms, since the corpus was lower-cased
countries.df$capital <- tolower(countries.df$capital)

## Omit NA
## Filter with a logical mask rather than a negative index: if there were no
## missing capitals, `countries.df[-cap.na, ]` would use an empty index and
## return zero rows, leaving the dictionary empty.
## `cap.na` is still created because the later clean-up step removes it.
cap.na <- which(is.na(countries.df$capital))
capital <- countries.df$capital[!is.na(countries.df$capital)]

## Document-term matrix restricted to the capital terms
dtm.cap <-
  DocumentTermMatrix(corpus, list(dictionary = capital))

inspect(dtm.cap[1:5, 1:7])

## Keep a dense matrix copy and drop the sparse object
dtm.cap.mat <- as.matrix(dtm.cap)

rm(dtm.cap)

#-------------------------------------------------------------------#
# 6) Frequency Data Frame
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# 6.1) Country name frequency
#-------------------------------------------------------------------#

##-- Initialize data frame --##
## One row per document; documents are labelled with the years 2006-2025
freq.df <- cbind(data.frame(year = 2006:2025), dtm.mat)
rownames(freq.df) <- seq_len(nrow(freq.df))

##-- Melt --##
## Long format: one row per year and country term, named during the melt
freq.df.l <- melt(freq.df, id.vars = "year",
                  variable.name = "country.name", value.name = "c_count")

##-- Merge --##
## Attach the ISO3 code for each term, then drop the term itself
freq.df.l <- merge(freq.df.l, countries.df[c("country.name", "iso3c")],
                   by = "country.name", all.x = TRUE)
freq.df.l$country.name <- NULL

#-------------------------------------------------------------------#
# 6.2) Country nationality frequency
#-------------------------------------------------------------------#

##-- Initialize data frame --##
## One row per document; documents are labelled with the years 2006-2025
freq.nat.df <- cbind(data.frame(year = 2006:2025), dtm.nat.mat)
rownames(freq.nat.df) <- seq_len(nrow(freq.nat.df))

##-- Melt --##
## Long format: one row per year and nationality term, named during the melt
freq.nat.df.l <- melt(freq.nat.df, id.vars = "year",
                      variable.name = "nationality", value.name = "nat_count")

##-- Merge --##
## Attach the ISO3 code for each term, then drop the term itself
freq.nat.df.l <- merge(freq.nat.df.l, countries.df[c("iso3c", "nationality")],
                       by = "nationality", all.x = TRUE)
freq.nat.df.l$nationality <- NULL

##-- Combine with the country-name counts --##
count.df <- merge(freq.df.l, freq.nat.df.l,
                  by = c("iso3c", "year"), all = TRUE)

#-------------------------------------------------------------------#
# 6.3) Capital frequency
#-------------------------------------------------------------------#

##-- Initialize data frame --##
## One row per document; documents are labelled with the years 2006-2025
freq.cap.df <- cbind(data.frame(year = 2006:2025), dtm.cap.mat)
rownames(freq.cap.df) <- seq_len(nrow(freq.cap.df))

##-- Melt --##
## Long format: one row per year and capital term, named during the melt
freq.cap.df.l <- melt(freq.cap.df, id.vars = "year",
                      variable.name = "capital", value.name = "cap_count")

##-- Merge --##
## Attach the ISO3 code for each term, then drop the term itself
freq.cap.df.l <- merge(freq.cap.df.l, countries.df[c("iso3c", "capital")],
                       by = "capital", all.x = TRUE)
freq.cap.df.l$capital <- NULL

##-- Combine with the country and nationality counts --##
count.df <- merge(count.df, freq.cap.df.l,
                  by = c("iso3c", "year"), all = TRUE)

## Clean up
rm(
  corpus, toSpace,
  dtm.mat, dtm.nat.mat, dtm.cap.mat,
  freq.df, freq.df.l,
  freq.nat.df, freq.nat.df.l, nat.na, nationality,
  freq.cap.df, freq.cap.df.l, cap.na, capital
)

#-------------------------------------------------------------------#
# 7) Frequency Analysis
#-------------------------------------------------------------------#

##-- Total mentions per country-year --##
## Sum of country-name, nationality and capital counts; missing counts are
## ignored
count.df$total_ta <- rowSums(
  count.df[c("c_count", "nat_count", "cap_count")],
  na.rm = TRUE
)

##-- Cast year ~ iso3c --##
## Wide format: one row per year, one column per ISO3 code
count.df.w <- dcast(count.df, year ~ iso3c, value.var = "total_ta")

##-- Convert to matrix --##
## The year column is dropped so only the counts remain
count.mat <- as.matrix(count.df.w[-1])

##-- Perform matrix calculations--##
## Each country's share of a year's total mentions, in percent
count.mat.p <- 100 * (count.mat / rowSums(count.mat))

## Re-attach the year labels to the shares
prop.df <- cbind(data.frame(year = 2006:2025), count.mat.p)

## Melt to one row per year and country, naming the columns during the melt
security.df <- melt(prop.df, id.vars = "year",
                    variable.name = "iso3c", value.name = "threat_share")

## Clean up
rm(count.df, count.df.w, count.mat, count.mat.p, countries.df, prop.df)

## Export the final dataset
write.csv(security.df, "US_Threats.csv", row.names = FALSE)
